#### Distinguishing Exposure to Secondhand and Thirdhand Tobacco Smoke among U.S. Children Using Machine Learning: NHANES 2013−2016 ####
# https://pubs.acs.org/doi/10.1021/acs.est.2c08121?ref=PDF 原文链接
# 2023年3月30日08:45:11 进行复现论文paper数据
# 本文主要讨论区分二手烟和三手烟暴露区分开的问题  选取的数据是ehanes 2013-2014 2015-2016年数据
# 文章摘要说了关于 server版本
library(haven) #用于读取xpt
library(plyr) #用于数据处理包
library(dplyr) #用于数据处理包
library(arsenal) ##用于tableby
library(survey) #用于加权分析
library(stringr) #用于字符处理
library(gtsummary)
library(missRanger)
library(mltools)
library(caret)
library(randomForest)
library(sjPlot)
library(sjlabelled)
library(sjmisc)
library(ggplot2)



agerange317dropna <- source
agerange317dropna$allGroup <-as.factor(agerange317dropna$allGroup)
tbl_summary(agerange317dropna,
            missing = 'no',
            # 通过属性分组
            by = allGroup,
            # 进行设置显示维度
            include = c(RIDAGEYR,Sex,race,FPL,DMDHHSIZ,childhouseholdmembers,familyhomeownershipstatus,HOD050),
            # 批量给变量起别名
            label = list(RIDAGEYR ~ "child age, M (SE)",
                         Sex ~ "child sex",
                         race ~ "child race/ethnicity",
                         FPL ~ "FPL",
                         DMDHHSIZ ~ "no. household members, M (SE)",
                         childhouseholdmembers ~ "no. child household members",
                         familyhomeownershipstatus ~ "family home ownership status",
                         HOD050 ~ "no. household rooms, M (SE)"
                         ),

            statistic = list(
              # 分别对应数值型和分类变量 后是需要显示表达式
              all_continuous() ~ "{mean}",
              all_categorical() ~ "{n} ({p} ) "
            ),
            digits = all_continuous() ~ 3
            )%>%
  #add_n() %>% # 添加非NA观测值个数
  # 这里p值不能用p 根据原文说的是加权得来的 需要用 tbl_svysummay进行 这里除了p值 其他是正确的
  # add_p() %>% # 添加P值
  add_overall() %>%
  #add_significance_stars(hide_se = FALSE,hide_p = TRUE)%>%
  #modify_header(std.error = "**SE**")
  add_ci() %>%
  modify_column_merge(pattern = "{stat_0} ({ci_stat_0})")%>%
  modify_column_merge(pattern = "{stat_1} ({ci_stat_1})")%>%
  modify_column_merge(pattern = "{stat_2} ({ci_stat_2})")%>%
  modify_column_merge(pattern = "{stat_3} ({ci_stat_3})")%>%
  #modify_spanning_header(c("stat_2", "stat_3","stat_4") ~ "**TSE group**")%>%
  modify_header(label = "**characteristic**",all_stat_cols() ~ "**{level}**<br>N = {n} <br> n(% [95% CI]")%>%flextable::save_as_html('Table1_Result.html')

  

# show_header_names(tblchild1)
# 表格2 和上面一样 暂时不知道怎么加95%
# 进行复现表格2 部分数据进行转换
# number of household smokers -> smokersize 


tbl_summary(agerange317dropna,
                      # 通过属性分组
                      by = allGroup,
                      # 进行设置显示维度
                      include = c(smokersize,cartse,otherhomeTSE,restaurantTSE,otherindoorareaTSE),
                      # 批量给变量起别名
                      label = list(smokersize ~ "number of household smokers",
                                   cartse ~ "car TSE",
                                   otherhomeTSE ~ "other home TSE",
                                   restaurantTSE ~ "restaurant TSE",
                                   otherindoorareaTSE ~ "other indoor area TSE"

                      ),

                      statistic = list(
                        # 分别对应数值型和分类变量 后是需要显示表达式
                        all_continuous() ~ "{mean}±{sd}",
                        all_categorical() ~ "{n} ({p}%)"
                      ),
                      digits = all_continuous() ~ 3
   )%>%
  #add_n() %>% # 添加非NA观测值个数
  # 这里p值不能用p 根据原文说的是加权得来的 需要用 tbl_svysummay进行 这里除了p值 其他是正确的
  # add_p() %>% # 添加P值
  add_overall() %>%
  add_ci(pattern = "{stat} ({ci})") %>%
  modify_spanning_header(c("stat_2", "stat_3","stat_4") ~ "**TSE group**")%>%
  modify_header(label = "**TSE variable**")%>%flextable::save_as_html('Table2_Result.html')

colnames(agerange317dropna)
#urinary TNE2
agerange317dropna$urinaryTNE2 <- (agerange317dropna$URXCOTT /176.2151) +(agerange317dropna$URXHCTT /192.2145  )
#NNAL/TNE2 
agerange317dropna$NNAL_TNE2 <- (agerange317dropna$URXNAL /agerange317dropna$urinaryTNE2)
#2-hydroxyfluorene (ng/L) 
agerange317dropna$twohydroxyfluorene<- agerange317dropna$URXP04 
#3-hydroxyfluorene (ng/L)
agerange317dropna$threehydroxyfluorene<- agerange317dropna$URXP03 
#2-hydroxyfluorene/TNE2 
agerange317dropna$twohydroxyfluorene_tne2<- agerange317dropna$twohydroxyfluorene /agerange317dropna$urinaryTNE2
#3-hydroxyfluorene/TNE2 
agerange317dropna$threehydroxyfluorene_tne2<- agerange317dropna$threehydroxyfluorene /agerange317dropna$urinaryTNE2
# 2CyEMA (ng/mL) 
agerange317dropna$twoCyEMA<- agerange317dropna$URXCYM 
# 2CyEMA/TNE2
agerange317dropna$twoCyEMA_TNE2<- agerange317dropna$twoCyEMA /agerange317dropna$urinaryTNE2
colnames(agerange317dropna)
# 血清可替宁->lbxcot
# 羟可替宁，血清 (ng/mL)->  lbxhct
# 总羟基可替宁，尿液 (ng/mL) ->  urxhctt
# 总可替宁，尿液 (ng/mL) ->  urxcott
# NNAL，尿液 (ng/mL) -> urxnal --tsna_h.xpt
# 2-羟基芴 (ng/L) -> urxp04
# 3-羟基芴 (ng/L) -> urxp03
# N-乙酰基-S-(2-氰乙基)-L-半胱氨酸(ng/mL) -> urxcym --  2CyEMA
tbl_summary(agerange317dropna,
            missing = 'no',
            # 通过属性分组
            by = allGroup,
            # 进行设置显示维度
            include = c(LBXCOT,LBXHCT,urinaryTNE2,URXNAL,NNAL_TNE2,twohydroxyfluorene,threehydroxyfluorene,twohydroxyfluorene_tne2,threehydroxyfluorene_tne2,twoCyEMA,twoCyEMA_TNE2),
            # 批量给变量起别名
            label = list(
              LBXCOT ~ "serum cotinine (ng/mL)",
              LBXHCT ~ "serum hydroxycotinine (ng/mL)",
              urinaryTNE2 ~ "urinary TNE2 (nmol/mL)",
              URXNAL ~ "NNAL (pg/mL)",
              NNAL_TNE2 ~ "NNAL/TNE2",
              twohydroxyfluorene ~ "2-hydroxyfluorene (ng/L)",
              threehydroxyfluorene ~ "3-hydroxyfluorene (ng/L)",
              twohydroxyfluorene_tne2 ~ "2-hydroxyfluorene/TNE2",
              threehydroxyfluorene_tne2 ~ "3-hydroxyfluorene/TNE2",
              twoCyEMA ~ "2CyEMA (ng/mL)",
              twoCyEMA_TNE2 ~ "2CyEMA/TNE2"),

            statistic = list(
              # 分别对应数值型和分类变量 后是需要显示表达式
              all_continuous() ~ "{mean}"
             # all_categorical() ~ "{n} ({p}) "
            ),
            digits = all_continuous() ~ 2
            )%>%
  add_n(statistic = "{n}({p})") %>% # 添加非NA观测值个数
  # 这里p值不能用p 根据原文说的是加权得来的 需要用 tbl_svysummay进行 这里除了p值 其他是正确的
  # add_p() %>% # 添加P值
  add_overall() %>%
  #add_significance_stars(hide_se = FALSE,hide_p = TRUE)%>%
  #modify_header(std.error = "**SE**")
  add_ci() %>%
  # modify_column_merge(pattern = "{stat_0} ({ci_stat_0})")%>%
  modify_column_merge(pattern = "{stat_1} ({ci_stat_1})")%>%
  modify_column_merge(pattern = "{stat_2} ({ci_stat_2})")%>%
  modify_column_merge(pattern = "{stat_3} ({ci_stat_3})")%>%
  modify_spanning_header(c("stat_1", "stat_2","stat_3") ~ "**TSE group membership**")%>%
  modify_header(label = "**biomarker variable**",all_stat_cols() ~ "**{level}**<br>GeoM (95% CI) ")%>% add_overall()%>%
  flextable::save_as_html('Table3_Result.html')

# 
# show_header_names(tblchild3)
# tblchild3
#agerange317dropna <- missRanger(agerange317dropna, pmm.k = 3, num.trees = 100)
#colnames(agerange317dropna)

# 主要表格4  随机森林模型 
# 还需要学习gini impurity 基尼系数 
# 16个重要变量进行测试
#  number of house smoker -> smokersize
#  serum cotinine ->LBXCOT
#  serum Hydroxycotinine ->LBXHCT
#  urinary nnal -> URXNAL
#  3-hydroxyfluorene/TNE2  -> threehydroxyfluorene_tne2
#  urinary TNE2 -> urinaryTNE2
#  2-hydroxyfluorene/TNE2  -> twohydroxyfluorene_tne2
#  2CyEMA/TNE2 -> twoCyEMA_TNE2
#  NNAL/TNE2  ->NNAL_TNE2
#  2-hydroxyfluorene -> twohydroxyfluorene
#  car tse ->cartse
#  3-hydroxyfluorene -> threehydroxyfluorene
#  2CyEMA  -> twoCyEMA
#  other home tse -> otherhomeTSE
#  otherindoorareaTSE -> otherindoorareaTSE
#  restaurantTSE -> restaurantTSE

colnames(agerange317dropna)
finalModel <- agerange317dropna[,c( 'smokersize','LBXCOT','LBXHCT','URXNAL','threehydroxyfluorene_tne2','urinaryTNE2',
                                   'twohydroxyfluorene_tne2','twoCyEMA_TNE2','NNAL_TNE2','twohydroxyfluorene','cartse','threehydroxyfluorene',
                                   'twoCyEMA','otherhomeTSE','otherindoorareaTSE','restaurantTSE','allGroup')]

# 文章说的提取
finalModel <- missRanger(finalModel, pmm.k = 2, num.trees = 500)
# table(finalModel$allGroup)
# dim(finalModel)
# 根据数据进行分析 拆分数据 80 训练 20 验证
# set.seed(40)
# trains <- createDataPartition(y = finalModel$allGroup,p = 0.8,list = FALSE)
# trainsdata <- finalModel[trains,]
# testdata <- finalModel[-trains,]
# 这块说的是下面六个模型进行拿着20%的数据进行验证 拿到结果进型六个额外模型验证 直接验证吧 没时间了
# all reported TSE and biomarker variables -> finalModel
# all reported TSE variables -> finalModel1
# reported TSE variables excluding number of household smokers -> finalMode2
# 所有的生物标志物排除了比值的
# all biomarker variables excluding ratios -> finalModel3
# biomarker and biomarker ratio variables excluding serum cotinine, serum hydroxycotinine, urinary TNE2, and urinary NNAL -> finalModel4
# all biomarker and biomarker ratio variables -> finalModel5
# all biomarker ratio variables -> finalModel6
# 首先生成这六额外个模型

finalModel1 <- finalModel[,c('otherhomeTSE','otherindoorareaTSE','restaurantTSE','cartse','smokersize','allGroup')]
finalModel2 <- finalModel[,c('otherhomeTSE','otherindoorareaTSE','restaurantTSE','cartse','allGroup')]
finalModel3 <- finalModel[,c('LBXCOT','LBXHCT','urinaryTNE2','URXNAL','twohydroxyfluorene','threehydroxyfluorene', 'twoCyEMA', 'allGroup')]
finalModel4 <- finalModel[,c('twohydroxyfluorene','threehydroxyfluorene', 'twoCyEMA','NNAL_TNE2','threehydroxyfluorene_tne2','twohydroxyfluorene_tne2','twoCyEMA_TNE2', 'allGroup')]
finalModel5 <- finalModel[,c('LBXCOT','LBXHCT','urinaryTNE2','URXNAL','twohydroxyfluorene','threehydroxyfluorene', 'twoCyEMA','NNAL_TNE2','threehydroxyfluorene_tne2','twohydroxyfluorene_tne2','twoCyEMA_TNE2', 'allGroup')]
finalModel6 <- finalModel[,c('NNAL_TNE2','threehydroxyfluorene_tne2','twohydroxyfluorene_tne2','twoCyEMA_TNE2', 'allGroup')]

set.seed(40)
finalModeltreeResult <- randomForest(allGroup ~ .,
                                 data = finalModel,
                                 ntree = 500,
                                 mtry = 16, # 自变量最大数
                                 importance = T)
# 下面每个预测
set.seed(40)
finalModeltreeResult1 <- randomForest(allGroup ~ .,
                                     data = finalModel1,
                                     ntree = 500,
                                     mtry = 5, # 自变量最大数
                                     importance = T)
set.seed(40)
finalModeltreeResult2 <- randomForest(allGroup ~ .,
                                     data = finalModel2,
                                     ntree = 500,
                                     mtry = 4, # 自变量最大数
                                     importance = T)
set.seed(40)
finalModeltreeResult3 <- randomForest(allGroup ~ .,
                                     data = finalModel3,
                                     ntree = 500,
                                     mtry = 7, # 自变量最大数
                                     importance = T)
set.seed(40)
finalModeltreeResult4 <- randomForest(allGroup ~ .,
                                     data = finalModel4,
                                     ntree = 500,
                                     mtry = 7, # 自变量最大数
                                     importance = T)
set.seed(40)
finalModeltreeResult5 <- randomForest(allGroup ~ .,
                                     data = finalModel5,
                                     ntree = 500,
                                     mtry = 11, # 自变量最大数
                                     importance = T)
set.seed(40)
finalModeltreeResult6 <- randomForest(allGroup ~ .,
                                      data = finalModel6,
                                      ntree = 500,
                                      mtry = 4, # 自变量最大数
                                      importance = T)

#可以认为直接拿着  OOB estimate of  error  当做overall 剩下那几个class error 当做误差 也就是 准确度进行计算
board <- read.csv('/home/jar/zxjar/paper_helper/userdata/0/85/bootBoard.csv')

#varImpPlot(finalModeltreeResult,main = "variable Importance plot",type = 2)%>%flextable::save_as_image(path = 'aaa.png')



# 进行拼接表格数据 2023年5月23日13:44:21
conf <- finalModeltreeResult$confusion[,-ncol(finalModeltreeResult$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult$mtry
as <-as.data.frame(finalModeltreeResult$confusion)
board[1,2] <-numva
board[1,3] <-oveall
board[1,4] <-(1-as['NEG','class.error'])*100
board[1,5] <-(1-as['TEG','class.error'])*100
board[1,6] <-(1-as['MEG','class.error'])*100
# 完成一个
conf <- finalModeltreeResult1$confusion[,-ncol(finalModeltreeResult1$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult1$mtry
as <-as.data.frame(finalModeltreeResult1$confusion)
board[2,2] <-numva
board[2,3] <-oveall
board[2,4] <-(1-as['NEG','class.error'])*100
board[2,5] <-(1-as['TEG','class.error'])*100
board[2,6] <-(1-as['MEG','class.error'])*100
# 完成2个
conf <- finalModeltreeResult2$confusion[,-ncol(finalModeltreeResult2$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult2$mtry
as <-as.data.frame(finalModeltreeResult2$confusion)
board[3,2] <-numva
board[3,3] <-oveall
board[3,4] <-(1-as['NEG','class.error'])*100
board[3,5] <-(1-as['TEG','class.error'])*100
board[3,6] <-(1-as['MEG','class.error'])*100
# 完成3个
conf <- finalModeltreeResult3$confusion[,-ncol(finalModeltreeResult3$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult3$mtry
as <-as.data.frame(finalModeltreeResult3$confusion)
board[4,2] <-numva
board[4,3] <-oveall
board[4,4] <-(1-as['NEG','class.error'])*100
board[4,5] <-(1-as['TEG','class.error'])*100
board[4,6] <-(1-as['MEG','class.error'])*100
# 完成4个
conf <- finalModeltreeResult4$confusion[,-ncol(finalModeltreeResult4$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult4$mtry
as <-as.data.frame(finalModeltreeResult4$confusion)
board[5,2] <-numva
board[5,3] <-oveall
board[5,4] <-(1-as['NEG','class.error'])*100
board[5,5] <-(1-as['TEG','class.error'])*100
board[5,6] <-(1-as['MEG','class.error'])*100
# 完成5个
conf <- finalModeltreeResult5$confusion[,-ncol(finalModeltreeResult5$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult5$mtry
as <-as.data.frame(finalModeltreeResult5$confusion)
board[6,2] <-numva
board[6,3] <-oveall
board[6,4] <-(1-as['NEG','class.error'])*100
board[6,5] <-(1-as['TEG','class.error'])*100
board[6,6] <-(1-as['MEG','class.error'])*100
# 完成6个
conf <- finalModeltreeResult6$confusion[,-ncol(finalModeltreeResult6$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult6$mtry
as <-as.data.frame(finalModeltreeResult6$confusion)
board[7,2] <-numva
board[7,3] <-oveall
board[7,4] <-(1-as['NEG','class.error'])*100
board[7,5] <-(1-as['TEG','class.error'])*100
board[7,6] <-(1-as['MEG','class.error'])*100
write.csv(board,'/home/jar/zxjar/paper_helper/userdata/0/85/boardOut.csv')
# 完成7个
# tab 4表格完成
# write2html(board,file = 'test.html')
# tbl_summary(board)%>%flextable::save_as_html(path = 'ddddd.html')
# colnames(board)
# htmltools::save_html(file = 'sss.html',board)
# tbl_summary(board)%>%flextable::save_as_html(path = 'Table3_Result.html')
# tb4%>%flextable::save_as_html(path = 'Table3_Result.html')
#importance(agerange317dropnatreeResult)
#table5
#plot(agerange317dropnatreeResult)
# varImpPlot(agerange317dropnatreeResult)
# varImpPlot(agerange317dropnatreeResult,main = "variable Importance score" ,type= 2,sort =T )
# partialPlot(x =agerange317dropnatreeResult,pred.data = finalModel,x.var = LBXCOT )
# plot(finalModel~LBXCOT,data = finalModel)
# prop.table(table(finalModel$smokersize,finalModel$allGroup),margin = 1)

# 最后一个表格复现  只是区分MEG 与 TEG
finalModel5 <- agerange317dropna[,c( 'smokersize','LBXCOT','LBXHCT','URXNAL','allGroup')]
finalModel5 <- missRanger(finalModel5, pmm.k = 2, num.trees = 500)
finalModel5 <- finalModel5[which(finalModel5$allGroup!='NEG'),]
# table(finalModel5$allGroup)
# finalModel5[which(finalModel5$allGroup!='NEG'),]
# dim(finalModel5)
# dim(finalModel5[which(!is.na(finalModel5$allGroup)),])
# dim(finalModel5[which(finalModel5$allGroup!='MEG'),]) + dim(finalModel5[which(finalModel5$allGroup!='TEG'),])

# aaa <- finalModel5[which(finalModel5$allGroup!='NEG'),]


# smokersizes <- recode_factor(finalModel5$smokersize, 
#                             '0 smokers' = 0,
#                             `1 smokers` = 1,
#                             `≥2 smokers` = 2
#                           
#                             
#                             
# )
# finalModel5$smokersize <-smokersizes
# finalModel5$smokersize <-as.numeric(smokersizes)



m1 <- glm(
  allGroup ~ LBXCOT + LBXHCT+URXNAL+smokersize, 
  data = finalModel5,
  
  family = binomial(link = 'logit') 
)

w <-plot_model(m1)
ggsave('table6.png')






